Home > sgwt_toolbox > demo > create_synthetic_dataset.m

create_synthetic_dataset

PURPOSE ^

create_synthetic_dataset creates test data for running nldr algorithms.

SYNOPSIS ^

function data = create_synthetic_dataset(data)

DESCRIPTION ^

 create_synthetic_dataset creates test data for running nldr algorithms.

 inputs:
    data      a struct describing the test data
              .dataset the number of the example, see code for more infos
              .n       the number of data points (default=400)
              .state   the initial state for the random numbers (default=0)
              .noise   the variance of Gaussian noise to add (default=0)
              other options for some of the data sets (see code)
              alternatively, data = 1 chooses the dataset directly;
              the number of points then defaults to 400

 outputs:
    data      a struct containing .x the generated data, each column is
              a data point, and other stuff:
              .z     the "correct" embedding
              .e     some random noise of same dimensionality
              .x_noisefree  the noisefree version of .x, i.e.
                     .x = .x_noisefree + sqrt(.noise) * .e

 Adapted from create.m, originally written by
 (c) Stefan Harmeling, 2006
 using the examples of the original LLE and ISOMAP code.

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SOURCE CODE ^

function data = create_synthetic_dataset(data)
% create_synthetic_dataset creates test data for running nldr algorithms.
%
% inputs:
%    data      a struct describing the test data
%              .dataset the number of the example:
%                         0  swiss roll with hole
%                        -1  uniform swiss roll (uniform in data space)
%                         1  classic swiss roll
%                        11  undersampled swiss roll (forces .n = 100)
%                        12  classic swiss roll      (forces .n = 400)
%                         2  scurve
%                         3  square rotated into .D dimensions
%                            (optional field .D, default=3)
%                         4  spiral
%                        -4  noisy spiral
%                         5  hole
%                         6  P (read from the file 'x' via load)
%                         7  fishbowl (uniform in data space)
%                         8  fishbowl (uniform in embedding space)
%                         9  gaussian blob
%              .n       the number of data points (default=400)
%              .state   the initial state for the random numbers (default=0)
%              .noise   the variance of Gaussian noise to add (default=0)
%              alternatively, data = 1 chooses the dataset directly;
%              all other options then take their defaults (n = 400).
%
% outputs:
%    data      a struct containing .x the generated data, each column is
%              a data point, and other stuff:
%              .z     the "correct" embedding
%              .e     some random noise of same dimensionality
%              .x_noisefree  the noisefree version of .x, i.e.
%                     .x = .x_noisefree + sqrt(.noise) * .e
%              .col   one scalar per data point (presumably used for
%                     colouring plots -- confirm against callers)
%              .name  a descriptive name of the dataset
%              .typ   always the string 'data'
%              .D     dimensionality of the data, size(.x, 1)
%              .distances  the result of distanz(.x)
%              .az, .el    set for some datasets only (presumably a
%                     viewing azimuth/elevation -- confirm against callers)
%
% An unknown dataset number raises the error
% 'create_synthetic_dataset:unknownDataset'.
%
% Adapted from create.m, originally written by
% (c) Stefan Harmeling, 2006
% using the examples of the original LLE and ISOMAP code.

if isstruct(data)
  if ~isfield(data, 'dataset')
    error('create_synthetic_dataset:missingDataset', ...
          'the input struct must contain the field ''dataset''');
  end
else
  % shorthand: the argument is the dataset number itself
  data = struct('dataset', data);
end
if ~isfield(data, 'n'), data.n = 400; end
if ~isfield(data, 'noise'), data.noise = 0.0; end
if ~isfield(data, 'state'), data.state = 0; end

% set the randomness
rand('state', data.state);
randn('state', data.state);

% all swiss rolls go from a*pi to b*pi
a = 1;
b = 4;

data.typ = 'data';
switch data.dataset
 case 0 % "swiss roll with hole"
  data.name = 'swiss roll with hole';
  % punch a rectangular hole at the center (done in centred coordinates)
  y = punch_hole(rand(2, data.n) - 0.5, 0.05, 0.15) + 0.5;
  tt = pi*((b-a)*y(1,:) + a);
  height = 21*y(2,:);
  data = roll_up(data, tt, height);
  data.z = [tt; height]; % the ground truth

 case -1 % "swiss roll" dataset extracted from LLE's swissroll.m
  data.name = 'uniform swiss roll';
  y = rand(2, data.n);
  data.z = y;  % the ground truth
  % uniform distribution along the manifold (in data space); the
  % nonuniform alternative tt = (b-a)*y(1,:) + a is what case 1 uses
  tt = pi*sqrt((b*b-a*a)*y(1,:) + a*a);
  % now tt goes from a*pi to b*pi
  data = roll_up(data, tt, 21*y(2,:));

 case {1, 11, 12} % swiss rolls, uniform in embedding space
  % dataset extracted from LLE's swissroll.m
  switch data.dataset
   case 1
    data.name = 'classic swiss roll';
   case 11 % "undersampled swiss roll": ignores the requested n
    data.name = 'undersampled swiss roll';
    data.n = 100;
   case 12 % "swiss roll" with a fixed n: ignores the requested n
    data.name = 'classic swiss roll';
    data.n = 400;
  end
  y = rand(2, data.n);
  tt = pi*((b-a)*y(1,:) + a);
  height = 21*y(2,:);
  data = roll_up(data, tt, height);
  data.z = [tt; height]; % the ground truth

 case 2 % "scurve" dataset extracted from LLE's scurve.m
  data.name = 'scurve';
  n = data.n;
  % The first half of the S uses ceil(n/2) angles, the second half reuses
  % the first floor(n/2) of them, so that every row has exactly n columns
  % even when n is odd.
  angle = pi*(1.5*rand(1, ceil(n/2)) - 1);
  height = 5*rand(1, n);
  angle2 = angle(1:floor(n/2));
  data.x = [cos(angle), -cos(angle2); ...
            height; ...
            sin(angle), 2-sin(angle2)];
  data.col = [angle, 1.5*pi + angle2];
  data.z = [data.col; height]; % the ground truth

 case 3 % "square" dataset, a uniformly sampled 2D square randomly
        % rotated into higher dimensions
  data.name = 'square';
  d = 2;     % intrinsic dimension
  % optional parameter for dataset==3
  % data.D      dimension of the data
  if ~isfield(data, 'D'), data.D = 3; end
  % generate random rotation matrix: the d leading eigenvectors of a
  % random symmetric matrix give D x d orthonormal columns
  D = data.D;
  A = randn(D, D);
  options.disp = 0;
  [R, dummy] = eigs(A*A', d, 'LM', options);
  tt = rand(d, data.n);
  data.col = tt(1,:);
  data.x = R*tt;
  data.z = tt;   % the ground truth
  data.az = 7;
  data.el = 40;

 case 4 % spiral: two dimensional "swiss roll"
  data.name = 'spiral';
  tt = (3*pi/2)*(1 + 2*rand(1, data.n));
  data.col = tt;
  data.x = [tt.*cos(tt); tt.*sin(tt)];
  data.z = tt; % the ground truth

 case -4 % spiral with unit-variance Gaussian noise built in
  data.name = 'noisy spiral';
  tt = (3*pi/2)*(1 + 2*rand(1, data.n));
  data.col = tt;
  data.x = [tt.*cos(tt); tt.*sin(tt)];
  data.x = data.x + randn(size(data.x));
  data.z = tt; % the ground truth

 case 5 % hole: a dataset with a hole
  data.name = 'hole';
  % punch a rectangular hole at the center
  data.x = punch_hole(rand(2, data.n) - 0.5, 0.2, 0.2);
  data.col = data.x(2,:);
  data.z = data.x;

 case 6 % P : taken from Saul's slides
  % note that for k=20, isomap and lle work fine which is very different
  % from the plots that Saul showed in his slides.
  data.name = 'P';
  % Functional form of load, so that nothing stored in the file can
  % overwrite local variables such as 'data'.
  loaded = load('x');
  if isstruct(loaded)
    x = loaded.x;   % MAT-file containing a variable named x
  else
    x = loaded;     % plain ASCII file: load returns the matrix itself
  end
  x(2,:) = 500 - x(2,:);
  data.x = x;
  data.z = x;
  data.col = data.z(2,:);
  data.n = size(x, 2);

 case 7 % fishbowl: uniform in data space
  gamma = 0.8;  % keep only points of the sphere with third coordinate < gamma
  data.name = 'fishbowl (uniform in data space)';
  n = data.n;
  data.x = rand(3,n) - 0.5;
  % project all data onto the surface of the unit sphere
  data.x = data.x ./ repmat(sqrt(sum(data.x.*data.x, 1)), [3 1]);
  ok = find(data.x(3,:) < gamma);
  i = length(ok);
  data.x(:, 1:i) = data.x(:, ok);
  % redraw one point at a time until n points lie below the rim
  while (i < n)
    p = rand(3,1) - 0.5;
    p = p / sqrt(p'*p);
    if (p(3) < gamma)
      i = i + 1;
      data.x(:, i) = p;
    end
  end
  % the projection on the plane works as follows:
  % start a beam from (0,0,1) through each surface point on the sphere
  % and look where it hits the xy plane.
  data.z = data.x(1:2,:) ./ repmat(1 - data.x(3,:), [2 1]);
  data.col = data.x(3,:);
  data.az = -18;
  data.el = 16;

 case 8 % fishbowl: uniform in embedding space
  data.name = 'fishbowl (uniform in embedding space)';
  n = data.n;
  data.z = rand(2, n) - 0.5;
  % keep the disc of radius 0.5
  ok = find(sum(data.z .* data.z, 1) <= 0.25);
  i = length(ok);
  data.z(:, 1:i) = data.z(:, ok);
  while (i < n)
    p = rand(2,1) - 0.5;
    if (p'*p <= 0.25)
      i = i + 1;
      data.z(:, i) = p;
    end
  end
  gamma = 0.8;  % same role/parameter as in case 7
  % rescale the disc so that its boundary maps to height gamma on the sphere
  data.z = 2*sqrt((1+gamma)/(1-gamma))*data.z;
  % project the disc onto the sphere
  alpha = 2 ./ (1 + sum(data.z .* data.z, 1));
  data.x = [repmat(alpha, [2 1]).*data.z; 1 - alpha];
  data.col = data.x(3,:);
  data.az = -18;
  data.el = 16;

 case 9  % a gaussian blob
  data.name = 'gaussian blob';
  data.x = randn(3, data.n);
  data.z = data.x(2:3,:);
  data.col = data.x(3,:);

 otherwise
  error('create_synthetic_dataset:unknownDataset', ...
        'unknown dataset: %s', mat2str(data.dataset));
end


data.D = size(data.x, 1);  % dimensionality of the data
% finally generate noise
data.e = randn(size(data.x));
data.x_noisefree = data.x;  % the noise free data
data.x = data.x_noisefree + sqrt(data.noise)*data.e;

% precalculate the distance matrix
data.distances = distanz(data.x);


function data = roll_up(data, tt, height)
% roll_up fills in the fields shared by all swiss-roll datasets: the 3D
% points .x on the roll with parameter tt and the given height, .col and
% the fixed .az/.el values.
data.col = tt;
data.x = [tt.*cos(tt); height; tt.*sin(tt)];
data.az = -4;
data.el = 13;


function y = punch_hole(y, l1, l2)
% punch_hole removes the columns of the 2 x n matrix y that fall inside the
% rectangle |y(1)| <= l1, |y(2)| <= l2 and replaces them by fresh draws of
% rand(2,1)-0.5 outside that rectangle, so that y keeps n columns.  The
% surviving columns are moved to the front in their original order.
n = size(y, 2);
ok = find((abs(y(1,:)) > l1) | (abs(y(2,:)) > l2));
i = length(ok);
y(:, 1:i) = y(:, ok);
while (i < n)
  p = rand(2,1) - 0.5;
  if (abs(p(1)) > l1) || (abs(p(2)) > l2)
    i = i + 1;
    y(:,i) = p;
  end
end

Generated on Wed 13-Oct-2010 13:36:39 by m2html © 2003